This file contains an example of tuning a Random Forest model with BayesSearchCV.
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default='notebook'
def _load_pickle(path):
    """Read a pickled object from *path* and return it."""
    with open(path, 'rb') as source:
        return pickle.load(source)


# Load the training features/target that were pickled by an earlier step.
X_train = _load_pickle('../X_train.pkl')
y_train = _load_pickle('../y_train.pkl')
# Styled summary of the numeric columns (null/zero counts, distribution stats).
hlp.pandas.numeric_summary(X_train, return_style=True)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Styled summary of the non-numeric columns (nulls, most frequent value, cardinality).
hlp.pandas.non_numeric_summary(X_train, return_style=True)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first ten target labels (binary 0/1 per the output below).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Count the occurrences of each class label.
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class balance: relative frequency of each label in y_train.
# Compute np.unique once rather than twice for the same result.
_, _label_counts = np.unique(y_train, return_counts=True)
_label_counts / _label_counts.sum()
array([0.69875, 0.30125])
from sklearn.preprocessing import OrdinalEncoder
# Quick demo: OrdinalEncoder maps each category of a column to an integer code.
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])
array([[0., 2.],
[2., 2.],
[9., 1.],
...,
[9., 3.],
[6., 4.],
[6., 2.]])
# Split column names by dtype so numeric and non-numeric features can be
# preprocessed by separate pipelines below.
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)
['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents'] ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
# Pipeline for numeric features; each TransformerChooser slot is populated by
# the hyper-parameter search (see `search_space` below).
numeric_pipeline = Pipeline([
    # tune whether or not we want to impute or simply remove rows with missing values
    ('imputer', hlp.sklearn_pipeline.TransformerChooser()),
    # this is here so that we can select between MinMax and Scaler;
    # if this pipeline is run in a context outside of tuning, no transformation will take place
    ('scaler', hlp.sklearn_pipeline.TransformerChooser()),
])
# Pipeline for non-numeric features; the encoder (one-hot vs. ordinal) is
# chosen during tuning via the TransformerChooser slot.
non_numeric_pipeline = Pipeline([
    ('encoder', hlp.sklearn_pipeline.TransformerChooser()),
])
from sklearn.compose import ColumnTransformer
# Route numeric and non-numeric columns to their respective preprocessing pipelines.
transformations_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, numeric_columns),
    ('non_numeric', non_numeric_pipeline, non_numeric_columns)
])
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import KernelPCA
random_forest_model = RandomForestClassifier(random_state=42)
# Full modeling pipeline: preprocessing followed by the random forest.
# The commented-out 'pca' steps are optional dimensionality-reduction
# alternatives that were considered but not enabled for this run.
full_pipeline = Pipeline([
    ('prep', transformations_pipeline),
    # ('pca', KernelPCA()),
    # ('pca', hlp.sklearn_pipeline.TransformerChooser()),
    ('model', random_forest_model)
])
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps
{'prep': ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser()),
('scaler',
TransformerChooser())]),
['duration', 'credit_amount',
'installment_commitment', 'residence_since',
'age', 'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps=[('encoder',
TransformerChooser())]),
['checking_status', 'credit_history',
'purpose', 'savings_status', 'employment',
'personal_status', 'other_parties',
'property_magnitude', 'other_payment_plans',
'housing', 'job', 'own_telephone',
'foreign_worker'])]),
'model': RandomForestClassifier(random_state=42)}
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score  # , roc_auc_score
from sklearn.metrics import get_scorer
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
# Collection of scorers for evaluating results from multiple angles.
# NOTE: the `SCORERS` dict was deprecated in sklearn 1.0 and removed in 1.3;
# `get_scorer` is the supported way to fetch a built-in scorer.
scores = {
    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    'ROC/AUC': get_scorer('roc_auc'),
    # `greater_is_better=True` is the default for make_scorer, so it is omitted.
    'F1': make_scorer(f1_score),
    'Pos. Pred. Val': make_scorer(precision_score),
    'True Pos. Rate': make_scorer(recall_score),
}
# Cross-validation configuration: 5-fold CV repeated twice = 10 fits per candidate.
num_folds = 5
num_repeats = 2
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold
# Hyper-parameter search space for BayesSearchCV. Pipeline-step parameters are
# addressed with sklearn's double-underscore path syntax (step__substep__param).
search_space = {
    # only mean-imputation is offered here, but the slot stays tunable
    'prep__numeric__imputer__transformer': Categorical([SimpleImputer(strategy='mean')]),
    # None => leave numeric features unscaled (trees don't require scaling)
    'prep__numeric__scaler__transformer': Categorical([
        None,
        MinMaxScaler(),
        StandardScaler()
    ]),
    'prep__non_numeric__encoder__transformer': Categorical([
        OneHotEncoder(),
        hlp.sklearn_pipeline.CustomOrdinalEncoder()
    ]),
    # Optional PCA alternatives, left disabled for this run:
    # 'pca__transformer': Categorical([
    #     None,
    #     KernelPCA(n_components=5, kernel='rbf'),
    #     KernelPCA(n_components=5, kernel='sigmoid'),
    #     KernelPCA(n_components=5, kernel='linear'),
    # ]),
    # 'pca__n_components': Integer(3, X_train.shape[1]),
    # 'pca__gamma': Real(0.03, 0.05),
    # 'pca__kernel': Categorical(['rbf', 'sigmoid']),
    'model__n_estimators': Integer(50, 5000),
    # If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
    'model__max_features': Real(.001, .99),
}
# Bayesian hyper-parameter search over the full pipeline (prep + model).
bayes_search = BayesSearchCV(
    estimator=full_pipeline,
    search_spaces=search_space,
    n_iter=50,  # number of parameter settings sampled by the optimizer
    cv=RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats),
    scoring='roc_auc',
    #return_train_score=True,
    n_jobs=-1,  # use all available cores
    verbose=1,
    random_state=42,
)
# Time the search. perf_counter() is a monotonic clock intended for measuring
# elapsed time; time.time() can jump if the wall clock is adjusted mid-run.
start_time = time.perf_counter()
bayes_search.fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time
# the search-space definition is no longer needed once the search has run
del search_space
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
# Report how long the search took, in seconds and minutes.
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 517.029 seconds; 8.6 minutes
# Raw cross-validation results: fit/score times, sampled parameter values,
# and per-split test scores for every candidate.
print(bayes_search.cv_results_)
{'mean_fit_time': array([ 6.3377691 , 12.61328192, 9.83223152, 1.99773061, 4.71405914,
12.63259985, 9.39273531, 10.47257104, 8.08451636, 4.86580131,
2.67155662, 6.27625751, 5.65203233, 0.08725915, 0.11712933,
4.96601081, 2.81815062, 6.04031501, 5.88370337, 6.04077718,
0.0851758 , 4.58678215, 4.40100007, 3.36408961, 8.21608126,
5.38535712, 0.28405123, 6.83285525, 6.16183512, 0.09648895,
4.68545451, 3.00817547, 15.69353282, 3.24381421, 2.87393954,
3.0361279 , 3.9013505 , 5.97755063, 12.88534379, 2.2311533 ,
4.078633 , 0.16469088, 2.17728469, 2.51503088, 4.00390766,
4.26460209, 8.43094141, 0.30152497, 0.21109216, 4.27965353]), 'std_fit_time': array([1.04496673, 1.75292197, 1.55755073, 0.28541007, 0.69495193,
1.83809892, 1.36159324, 1.50957589, 1.08866071, 0.81153454,
0.43832068, 0.96645453, 0.98827516, 0.00999386, 0.0126077 ,
0.80076678, 0.43477836, 0.97863545, 0.98489456, 0.96301633,
0.016284 , 0.71698066, 0.74072023, 0.56500841, 1.27980534,
0.87600623, 0.04732614, 1.28486966, 0.99137869, 0.01462773,
0.76251385, 0.45324575, 2.14052777, 0.51287317, 0.46654645,
0.49328505, 0.65147326, 1.0179914 , 2.13028735, 0.37844407,
0.66550946, 0.01757908, 0.3941849 , 0.44670316, 0.60840104,
0.79772123, 1.23189501, 0.04958216, 0.03144109, 0.79018461]), 'mean_score_time': array([0.34406464, 0.35878899, 0.46091964, 0.08771956, 0.18720131,
0.40799849, 0.31672559, 0.41627972, 0.28911581, 0.43177924,
0.21572506, 0.53013544, 0.50153792, 0.01094034, 0.01249387,
0.34403062, 0.18180974, 0.5274169 , 0.54346461, 0.52679751,
0.01326249, 0.2953532 , 0.34906182, 0.26165347, 0.43141994,
0.39988139, 0.01831119, 0.61786404, 0.55409122, 0.02048793,
0.3815825 , 0.13870585, 0.40274615, 0.2543237 , 0.21500859,
0.23036087, 0.35296335, 0.49400654, 0.43995385, 0.16624279,
0.35646045, 0.02449212, 0.16621239, 0.1966116 , 0.32967532,
0.32575653, 0.38328531, 0.03596561, 0.02742188, 0.30752115]), 'std_score_time': array([0.06201504, 0.05772105, 0.10280728, 0.01537402, 0.03244367,
0.07318998, 0.05096711, 0.08130741, 0.04573985, 0.08928277,
0.04192221, 0.12016399, 0.11256779, 0.00502726, 0.00715396,
0.07286157, 0.03012782, 0.11654759, 0.12987298, 0.11731768,
0.00520617, 0.05249781, 0.06934819, 0.0491329 , 0.07780806,
0.08644717, 0.01216805, 0.17486647, 0.13191023, 0.0041861 ,
0.07340752, 0.02298264, 0.06689111, 0.04702289, 0.03759065,
0.04136662, 0.10485752, 0.08495576, 0.08069624, 0.0277303 ,
0.08159427, 0.01156586, 0.03393668, 0.03845845, 0.06924794,
0.06624959, 0.12996785, 0.01630949, 0.00543579, 0.05514542]), 'param_model__max_features': masked_array(data=[0.4065928153059274, 0.8291770836421982,
0.44093935466583917, 0.8044596324854324,
0.7917583537494942, 0.7269537097877271,
0.6112914855587909, 0.5384256339457191,
0.9459677284605191, 0.004590373874941051, 0.001,
0.13401834911815705, 0.001, 0.14949426456739603,
0.4345692733009416, 0.12440555380397834,
0.21616910419499824, 0.0037119355318561793, 0.001,
0.001, 0.001, 0.2006888575265783, 0.030963456913936187,
0.001, 0.23899569622964326, 0.10935668641247999,
0.9824743170685183, 0.001, 0.001,
0.0020146239870325287, 0.001, 0.32990376873180677,
0.9839502331380167, 0.0021310294730539645,
0.0035564688932104276, 0.0025135149647515145, 0.001,
0.001313943336839787, 0.99, 0.002885414697718809,
0.001, 0.9883375232379773, 0.0014958882940029122,
0.0012921076767968811, 0.001, 0.001,
0.9849753326424058, 0.0014057642452401946,
0.017381611125871273, 0.001],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__n_estimators': masked_array(data=[3652, 4422, 4598, 901, 2218, 4700, 3888, 4604, 3518,
4098, 2219, 4977, 4571, 50, 50, 3452, 2021, 4990, 4805,
5000, 50, 3245, 3603, 2733, 4964, 4128, 71, 5000, 5000,
59, 3865, 1619, 4900, 2683, 2286, 2487, 3205, 4871,
5000, 1729, 3217, 58, 1586, 2004, 3269, 3263, 2382,
217, 150, 3272],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__non_numeric__encoder__transformer': masked_array(data=[CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__numeric__imputer__transformer': masked_array(data=[SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__numeric__scaler__transformer': masked_array(data=[MinMaxScaler(), StandardScaler(), None, MinMaxScaler(),
StandardScaler(), StandardScaler(), MinMaxScaler(),
MinMaxScaler(), MinMaxScaler(), MinMaxScaler(),
StandardScaler(), StandardScaler(), None,
MinMaxScaler(), None, None, MinMaxScaler(), None,
MinMaxScaler(), MinMaxScaler(), None, MinMaxScaler(),
StandardScaler(), None, StandardScaler(), None,
MinMaxScaler(), MinMaxScaler(), None, None,
StandardScaler(), StandardScaler(), StandardScaler(),
MinMaxScaler(), MinMaxScaler(), None, MinMaxScaler(),
StandardScaler(), None, StandardScaler(),
StandardScaler(), MinMaxScaler(), MinMaxScaler(),
MinMaxScaler(), StandardScaler(), StandardScaler(),
MinMaxScaler(), MinMaxScaler(), StandardScaler(),
StandardScaler()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'params': [OrderedDict([('model__max_features', 0.4065928153059274), ('model__n_estimators', 3652), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.8291770836421982), ('model__n_estimators', 4422), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.44093935466583917), ('model__n_estimators', 4598), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.8044596324854324), ('model__n_estimators', 901), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.7917583537494942), ('model__n_estimators', 2218), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.7269537097877271), ('model__n_estimators', 4700), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.6112914855587909), ('model__n_estimators', 3888), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.5384256339457191), ('model__n_estimators', 
4604), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.9459677284605191), ('model__n_estimators', 3518), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.004590373874941051), ('model__n_estimators', 4098), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 2219), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.13401834911815705), ('model__n_estimators', 4977), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 4571), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.14949426456739603), ('model__n_estimators', 50), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.4345692733009416), ('model__n_estimators', 50), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), 
('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.12440555380397834), ('model__n_estimators', 3452), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.21616910419499824), ('model__n_estimators', 2021), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.0037119355318561793), ('model__n_estimators', 4990), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 4805), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 5000), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 50), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.2006888575265783), ('model__n_estimators', 3245), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.030963456913936187), ('model__n_estimators', 3603), 
('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 2733), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.23899569622964326), ('model__n_estimators', 4964), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.10935668641247999), ('model__n_estimators', 4128), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.9824743170685183), ('model__n_estimators', 71), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 5000), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 5000), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.0020146239870325287), ('model__n_estimators', 59), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', 
None)]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 3865), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.32990376873180677), ('model__n_estimators', 1619), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.9839502331380167), ('model__n_estimators', 4900), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.0021310294730539645), ('model__n_estimators', 2683), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.0035564688932104276), ('model__n_estimators', 2286), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.0025135149647515145), ('model__n_estimators', 2487), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 3205), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.001313943336839787), ('model__n_estimators', 4871), 
('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.99), ('model__n_estimators', 5000), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__max_features', 0.002885414697718809), ('model__n_estimators', 1729), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 3217), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.9883375232379773), ('model__n_estimators', 58), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.0014958882940029122), ('model__n_estimators', 1586), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.0012921076767968811), ('model__n_estimators', 2004), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 3269), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), 
('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 3263), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.9849753326424058), ('model__n_estimators', 2382), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.0014057642452401946), ('model__n_estimators', 217), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__max_features', 0.017381611125871273), ('model__n_estimators', 150), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 3272), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])], 'split0_test_score': array([0.76725782, 0.76543899, 0.77256039, 0.78240485, 0.77082251,
0.76811594, 0.7427836 , 0.8125551 , 0.70573635, 0.74913719,
0.75813088, 0.72981771, 0.77136752, 0.73663288, 0.78681743,
0.79276538, 0.77797101, 0.72443917, 0.73912634, 0.80357143,
0.78036364, 0.78173981, 0.82793605, 0.73345455, 0.70361635,
0.72451637, 0.705 , 0.77645455, 0.81166766, 0.7597904 ,
0.80822821, 0.69014211, 0.72364812, 0.8095806 , 0.75019592,
0.74196366, 0.76765204, 0.79330608, 0.74960815, 0.79830851,
0.70451945, 0.74554146, 0.77223746, 0.82696291, 0.79580307,
0.74847445, 0.83158669, 0.73549965, 0.72286442, 0.77403927]), 'split1_test_score': array([0.65341258, 0.73785266, 0.70852947, 0.72482807, 0.75024846,
0.72440601, 0.75452196, 0.76521739, 0.77141721, 0.69240669,
0.72956731, 0.73628126, 0.77990338, 0.83653846, 0.72329472,
0.75118182, 0.81690909, 0.74443832, 0.80787037, 0.73246753,
0.77151196, 0.73637317, 0.7481121 , 0.75623219, 0.76086957,
0.81808891, 0.74103738, 0.72330014, 0.68036404, 0.74813871,
0.79061103, 0.81855072, 0.75767604, 0.79943792, 0.80009091,
0.73746566, 0.70170455, 0.7439277 , 0.70670063, 0.79826087,
0.79984099, 0.72969108, 0.75707899, 0.76014241, 0.76006054,
0.78661327, 0.67711195, 0.86001176, 0.74854545, 0.76492754]), 'split2_test_score': array([0.83796296, 0.74510305, 0.76914492, 0.75515374, 0.72538128,
0.77326468, 0.7966538 , 0.71442426, 0.78495856, 0.7765368 ,
0.80164251, 0.74990465, 0.78516738, 0.74266954, 0.74492754,
0.75267094, 0.74123044, 0.76812306, 0.72556957, 0.81525454,
0.70560386, 0.73472727, 0.74011429, 0.77209443, 0.78966184,
0.74557742, 0.72699176, 0.74909939, 0.79359901, 0.68108974,
0.76411105, 0.75263367, 0.78354204, 0.80418182, 0.77697173,
0.75706845, 0.84008658, 0.72545455, 0.743 , 0.71676779,
0.77157582, 0.71003135, 0.75296554, 0.75618182, 0.73558455,
0.76905271, 0.67912946, 0.70885427, 0.82917874, 0.72154545]), 'split3_test_score': array([0.76491667, 0.69101864, 0.79913587, 0.70330026, 0.72302768,
0.745 , 0.72117287, 0.69951691, 0.706045 , 0.82523148,
0.72209091, 0.79516908, 0.76917651, 0.74239971, 0.75993884,
0.79288412, 0.72291667, 0.7728833 , 0.78413318, 0.74446987,
0.73388635, 0.7332652 , 0.75369162, 0.78309091, 0.74773512,
0.75795251, 0.75028604, 0.78338914, 0.78536442, 0.7872151 ,
0.67829901, 0.79119459, 0.77017834, 0.72927083, 0.76563592,
0.768221 , 0.71815741, 0.7292412 , 0.75372727, 0.80929289,
0.791 , 0.68926407, 0.76991758, 0.73393417, 0.77996368,
0.78004892, 0.76110811, 0.79909726, 0.74209091, 0.8041958 ]), 'split4_test_score': array([0.7582728 , 0.76811594, 0.76567398, 0.79862549, 0.77440706,
0.73569794, 0.7251333 , 0.79903382, 0.77463636, 0.82253813,
0.78032037, 0.78781554, 0.76933672, 0.78751959, 0.74260968,
0.77480587, 0.78753247, 0.7826087 , 0.79128224, 0.70301291,
0.74008111, 0.77396953, 0.71854545, 0.75578987, 0.82950159,
0.73036961, 0.70433313, 0.76683408, 0.75845798, 0.78849922,
0.76790336, 0.70003356, 0.71909341, 0.71928404, 0.76975845,
0.77950837, 0.77217742, 0.80377143, 0.75709751, 0.77342995,
0.7781808 , 0.70523442, 0.74552613, 0.74305997, 0.78952991,
0.77598887, 0.7759833 , 0.70251208, 0.78694801, 0.78180804]), 'split5_test_score': array([0.71168754, 0.77358491, 0.65748588, 0.78560386, 0.74505743,
0.80532787, 0.74875356, 0.79005246, 0.75818182, 0.78127384,
0.80272727, 0.74485126, 0.78412728, 0.77993898, 0.67799964,
0.78565374, 0.76916476, 0.73445843, 0.8182227 , 0.73511905,
0.69 , 0.80528652, 0.81105253, 0.73377619, 0.76539258,
0.77938988, 0.76774619, 0.69571612, 0.77400966, 0.74176238,
0.71014493, 0.72950038, 0.70677908, 0.75827991, 0.727343 ,
0.80681604, 0.79451567, 0.74216524, 0.72462386, 0.75246873,
0.783864 , 0.72209091, 0.75536232, 0.77326468, 0.79433333,
0.78770476, 0.74157409, 0.76582761, 0.77708209, 0.81054075]), 'split6_test_score': array([0.70098586, 0.76163148, 0.76978355, 0.7546875 , 0.7392876 ,
0.79702504, 0.83723958, 0.80733618, 0.69880698, 0.72570975,
0.74526707, 0.80554545, 0.6900772 , 0.65643601, 0.75942029,
0.81071888, 0.8512608 , 0.73255411, 0.7590812 , 0.73616474,
0.69635171, 0.67806763, 0.79328757, 0.76425137, 0.72977539,
0.83381643, 0.73360656, 0.74666928, 0.71777269, 0.72038784,
0.80763285, 0.74868973, 0.72916667, 0.74435028, 0.75428571,
0.74159021, 0.78004834, 0.79054659, 0.73954545, 0.68733347,
0.81345843, 0.74906994, 0.716 , 0.72927273, 0.71182619,
0.77468969, 0.63150956, 0.80436829, 0.81472416, 0.72918182]), 'split7_test_score': array([0.78418803, 0.75810185, 0.74392633, 0.6809375 , 0.77449899,
0.66380952, 0.67281818, 0.68857089, 0.7659511 , 0.727343 ,
0.81686254, 0.75206044, 0.82739028, 0.80453183, 0.78478629,
0.7284834 , 0.76115942, 0.82341724, 0.72218407, 0.80143025,
0.78103896, 0.81062695, 0.736971 , 0.69787319, 0.71205357,
0.63426754, 0.72227665, 0.78143601, 0.74676724, 0.71161978,
0.7761537 , 0.7398795 , 0.81514498, 0.82418182, 0.75117852,
0.74357284, 0.75625105, 0.7710717 , 0.74820739, 0.77994792,
0.78610474, 0.698125 , 0.75018182, 0.79212382, 0.74379482,
0.71548228, 0.79394861, 0.73037093, 0.72935268, 0.77573006]), 'split8_test_score': array([0.77393211, 0.73283753, 0.78004892, 0.72917749, 0.70858614,
0.78497886, 0.77565217, 0.75534705, 0.76949451, 0.77576411,
0.66716997, 0.76668574, 0.77827273, 0.77178744, 0.74207621,
0.7230235 , 0.72541063, 0.72219122, 0.78164185, 0.8034897 ,
0.80947293, 0.76139601, 0.73699803, 0.77505636, 0.70101881,
0.76713438, 0.73023997, 0.74692641, 0.78970384, 0.74816652,
0.76034632, 0.74180328, 0.7427836 , 0.80747505, 0.80458333,
0.77953297, 0.75418527, 0.73272727, 0.75934661, 0.80031447,
0.79363636, 0.742 , 0.78888889, 0.72008547, 0.83657513,
0.81592868, 0.74212988, 0.66010074, 0.70663636, 0.7756543 ]), 'split9_test_score': array([0.74198718, 0.68461682, 0.79481312, 0.73861566, 0.81862453,
0.68358262, 0.75371854, 0.7387593 , 0.77608618, 0.82719097,
0.77668456, 0.77320612, 0.73320402, 0.71088435, 0.79490847,
0.78862407, 0.73415242, 0.79567124, 0.7767472 , 0.76642208,
0.72977539, 0.78354788, 0.71378128, 0.80926724, 0.83369693,
0.77996157, 0.73796509, 0.79700483, 0.77475045, 0.72756128,
0.80806636, 0.78279533, 0.72180248, 0.72854545, 0.77958333,
0.73481818, 0.74666667, 0.77388861, 0.74977018, 0.82616487,
0.75516908, 0.74678187, 0.80787911, 0.76070969, 0.73449275,
0.76420671, 0.75892857, 0.79954545, 0.70913419, 0.72777158]), 'mean_test_score': array([0.74946036, 0.74183019, 0.75611024, 0.74533344, 0.75299417,
0.74812085, 0.75284476, 0.75708134, 0.75113141, 0.77031319,
0.76004634, 0.76413373, 0.7688023 , 0.75693388, 0.75167791,
0.77008117, 0.76877077, 0.76007848, 0.77058587, 0.76414021,
0.74380859, 0.7599 , 0.75804899, 0.75808863, 0.75733217,
0.75710746, 0.73194828, 0.75668299, 0.7632457 , 0.7414231 ,
0.76714968, 0.74952229, 0.74698148, 0.77245877, 0.76796268,
0.75905574, 0.7631445 , 0.76061004, 0.74316271, 0.77422895,
0.77773497, 0.72378301, 0.76160378, 0.75957377, 0.7681964 ,
0.77181903, 0.73930102, 0.7566188 , 0.7566557 , 0.76653946]), 'std_test_score': array([0.04829957, 0.02979282, 0.04096371, 0.0355812 , 0.03071986,
0.04478263, 0.04225958, 0.04322252, 0.0318887 , 0.04418244,
0.04322679, 0.02449728, 0.03408817, 0.04830634, 0.03283612,
0.02810647, 0.0394414 , 0.03221666, 0.03150036, 0.0372332 ,
0.03836411, 0.03802983, 0.03712452, 0.02932401, 0.04595749,
0.0526524 , 0.01830071, 0.02920632, 0.03732979, 0.03147078,
0.04102692, 0.03761324, 0.03246968, 0.03824386, 0.02243328,
0.0225768 , 0.0367286 , 0.02780247, 0.01537739, 0.04138127,
0.0286516 , 0.02096302, 0.02381341, 0.03034623, 0.03571163,
0.02508913, 0.05729716, 0.05688737, 0.04109969, 0.02955781]), 'rank_test_score': array([40, 46, 34, 43, 35, 41, 36, 29, 38, 6, 21, 15, 8, 30, 37, 7, 9,
20, 5, 14, 44, 22, 26, 25, 27, 28, 49, 31, 16, 47, 12, 39, 42, 3,
11, 24, 17, 19, 45, 2, 1, 50, 18, 23, 10, 4, 48, 33, 32, 13],
dtype=int32)}
# best mean cross-validation score found by the Bayesian search
print(bayes_search.best_score_)
0.7777349689095895
# hyper-parameter combination that produced the best score
print(bayes_search.best_params_)
OrderedDict([('model__max_features', 0.001), ('model__n_estimators', 3217), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])
# Map the verbose pipeline/model parameter names to short, readable labels
# used in the results tables and plots below.
new_param_column_names = {
    'model__max_features': 'max_features',
    'model__n_estimators': 'n_estimators',
    # pca mappings (no pca step in this run):
    # 'pca__transformer': 'pca',
    # 'pca__n_components': 'pca: n_comps',
    # 'pca__gamma': 'pca: gamma',
    # 'pca__kernel': 'pca: kernel',
    'prep__non_numeric__encoder__transformer': 'encoder',
    'prep__numeric__imputer__transformer': 'imputer',
    'prep__numeric__scaler__transformer': 'scaler',
}
# Wrap the BayesSearchCV results in an MLExperimentResults object, using the
# short parameter labels defined above.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better=True,
    parameter_name_mappings=new_param_column_names,
)
# Persist the results to YAML, then reload from the file to confirm the
# round-trip works.
yaml_file = 'Run 1 - Random Forest - BayesSearchCV.yaml'
results.to_yaml_file(yaml_file_name=yaml_file)
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=yaml_file)
# average model fit time for each of the trials (presumably seconds, from sklearn's fit times — TODO confirm)
results.fit_time_averages
array([ 6.3377691 , 12.61328192, 9.83223152, 1.99773061, 4.71405914,
12.63259985, 9.39273531, 10.47257104, 8.08451636, 4.86580131,
2.67155662, 6.27625751, 5.65203233, 0.08725915, 0.11712933,
4.96601081, 2.81815062, 6.04031501, 5.88370337, 6.04077718,
0.0851758 , 4.58678215, 4.40100007, 3.36408961, 8.21608126,
5.38535712, 0.28405123, 6.83285525, 6.16183512, 0.09648895,
4.68545451, 3.00817547, 15.69353282, 3.24381421, 2.87393954,
3.0361279 , 3.9013505 , 5.97755063, 12.88534379, 2.2311533 ,
4.078633 , 0.16469088, 2.17728469, 2.51503088, 4.00390766,
4.26460209, 8.43094141, 0.30152497, 0.21109216, 4.27965353])
# best mean cross-validation score (matches bayes_search.best_score_ above)
results.best_primary_score
0.7777349689095895
# parameters of the best trial, using the mapped (short) parameter names
results.best_primary_score_params
{'max_features': 0.001,
'n_estimators': 3217,
'encoder': 'CustomOrdinalEncoder()',
'imputer': 'SimpleImputer()',
'scaler': 'StandardScaler()'}
# formatted summary of the top 30 trials, sorted by mean score, with 95% confidence intervals
results.to_formatted_dataframe(num_rows=30)
| roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_features | n_estimators | encoder | scaler |
|---|---|---|---|---|---|---|
| 0.778 | 0.757 | 0.798 | 0.001 | 3,217 | CustomOrdinalEncoder() | StandardScaler() |
| 0.774 | 0.745 | 0.804 | 0.003 | 1,729 | CustomOrdinalEncoder() | StandardScaler() |
| 0.772 | 0.745 | 0.800 | 0.002 | 2,683 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.772 | 0.754 | 0.790 | 0.001 | 3,263 | CustomOrdinalEncoder() | StandardScaler() |
| 0.771 | 0.748 | 0.793 | 0.001 | 4,805 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.770 | 0.739 | 0.802 | 0.005 | 4,098 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.770 | 0.750 | 0.790 | 0.124 | 3,452 | OneHotEncoder() | None |
| 0.769 | 0.744 | 0.793 | 0.001 | 4,571 | OneHotEncoder() | None |
| 0.769 | 0.741 | 0.797 | 0.216 | 2,021 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.768 | 0.743 | 0.794 | 0.001 | 3,269 | CustomOrdinalEncoder() | StandardScaler() |
| 0.768 | 0.752 | 0.784 | 0.004 | 2,286 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.767 | 0.738 | 0.796 | 0.001 | 3,865 | CustomOrdinalEncoder() | StandardScaler() |
| 0.767 | 0.745 | 0.788 | 0.001 | 3,272 | CustomOrdinalEncoder() | StandardScaler() |
| 0.764 | 0.738 | 0.791 | 0.001 | 5,000 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.764 | 0.747 | 0.782 | 0.134 | 4,977 | CustomOrdinalEncoder() | StandardScaler() |
| 0.763 | 0.737 | 0.790 | 0.001 | 5,000 | CustomOrdinalEncoder() | None |
| 0.763 | 0.737 | 0.789 | 0.001 | 3,205 | OneHotEncoder() | MinMaxScaler() |
| 0.762 | 0.745 | 0.779 | 0.001 | 1,586 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.761 | 0.741 | 0.780 | 0.001 | 4,871 | OneHotEncoder() | StandardScaler() |
| 0.760 | 0.737 | 0.783 | 0.004 | 4,990 | OneHotEncoder() | None |
| 0.760 | 0.729 | 0.791 | 0.001 | 2,219 | CustomOrdinalEncoder() | StandardScaler() |
| 0.760 | 0.733 | 0.787 | 0.201 | 3,245 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.760 | 0.738 | 0.781 | 0.001 | 2,004 | OneHotEncoder() | MinMaxScaler() |
| 0.759 | 0.743 | 0.775 | 0.003 | 2,487 | OneHotEncoder() | None |
| 0.758 | 0.737 | 0.779 | 0.001 | 2,733 | OneHotEncoder() | None |
| 0.758 | 0.731 | 0.785 | 0.031 | 3,603 | OneHotEncoder() | StandardScaler() |
| 0.757 | 0.724 | 0.790 | 0.239 | 4,964 | OneHotEncoder() | StandardScaler() |
| 0.757 | 0.719 | 0.795 | 0.109 | 4,128 | CustomOrdinalEncoder() | None |
| 0.757 | 0.726 | 0.788 | 0.538 | 4,604 | OneHotEncoder() | MinMaxScaler() |
| 0.757 | 0.722 | 0.791 | 0.149 | 50 | OneHotEncoder() | MinMaxScaler() |
# gives the score rank for each trial index
# e.g. array([4, 2, 1, 3])
# means the 1st iteration (i.e. set of params) was the worst (rank 4)
# and the 3rd iteration was the best (rank 1)
results.primary_score_trial_ranking
array([40, 46, 34, 43, 35, 41, 36, 29, 38, 6, 21, 15, 8, 30, 37, 7, 9,
20, 5, 14, 44, 22, 26, 25, 27, 28, 49, 31, 16, 47, 12, 39, 42, 3,
11, 24, 17, 19, 45, 2, 1, 50, 18, 23, 10, 4, 48, 33, 32, 13])
# gives the trial indexes ordered from best to worst score
# e.g. a primary_score_trial_ranking of array([4, 2, 1, 3])
# would return [2, 1, 3, 0] because index 2 (i.e. 3rd iteration) was the best, so it is the first index;
# and index 0 (i.e. 1st iteration) was the worst, so it is the last index
results.primary_score_best_indexes
array([40, 39, 33, 45, 18, 9, 15, 12, 16, 44, 34, 30, 49, 19, 11, 28, 36,
42, 37, 17, 10, 21, 43, 35, 23, 22, 24, 25, 7, 13, 27, 48, 47, 2,
4, 6, 14, 8, 31, 0, 5, 32, 3, 20, 38, 1, 29, 46, 26, 41])
# visualize the search results
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size='n_estimators', color='max_features').show()
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=600, width=600 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params()
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# cross-validation score vs a single parameter, colored by a second parameter
results.plot_score_vs_parameter(
parameter='max_features',
# size='colsample_bytree',
color='n_estimators'
)
# relationship between two parameters across trials
results.plot_parameter_vs_parameter(
parameter_x='n_estimators',
parameter_y='max_features',
)
# Build a dataframe containing only the primary-score mean and the
# hyper-parameter columns, which the score will be regressed against.
# (The stray "roc_auc Mean¶" text fused onto this line was notebook output
# accidentally pasted into the code; it has been removed.)
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | max_features | n_estimators | encoder | scaler | |
|---|---|---|---|---|---|
| 40 | 0.777735 | 0.001000 | 3217 | CustomOrdinalEncoder() | StandardScaler() |
| 39 | 0.774229 | 0.002885 | 1729 | CustomOrdinalEncoder() | StandardScaler() |
| 33 | 0.772459 | 0.002131 | 2683 | CustomOrdinalEncoder() | MinMaxScaler() |
| 45 | 0.771819 | 0.001000 | 3263 | CustomOrdinalEncoder() | StandardScaler() |
| 18 | 0.770586 | 0.001000 | 4805 | CustomOrdinalEncoder() | MinMaxScaler() |
# statsmodels formulas cannot contain spaces or punctuation in variable names,
# so sanitize each column name: spaces become underscores and any character
# that is not alphanumeric (or an underscore) is dropped.
def _clean_name(name):
    underscored = name.replace(' ', '_')
    return ''.join(ch for ch in underscored if ch == '_' or ch.isalnum())

cleaned_column_names = {column: _clean_name(column) for column in score_dataframe.columns.tolist()}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'max_features': 'max_features',
'n_estimators': 'n_estimators',
'encoder': 'encoder',
'scaler': 'scaler'}
# rename the columns to the sanitized names so they are valid in a formula
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
# regress the mean cross-validation score against the hyper-parameters
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
# presumably joins the predictor names with ' + ' to form the formula string
# (the printed formula below confirms the result) — TODO confirm collapse() semantics
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
# NOTE(review): this rebinds `results`, shadowing the MLExperimentResults
# object created earlier; later cells rely on `results` being the fitted model
model = smf.ols(formula=formula, data = score_dataframe)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ max_features + n_estimators + encoder + scaler
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.697
Model: OLS Adj. R-squared: 0.663
Method: Least Squares F-statistic: 20.29
Date: Mon, 31 Jan 2022 Prob (F-statistic): 1.90e-10
Time: 09:13:53 Log-Likelihood: 184.29
No. Observations: 50 AIC: -356.6
Df Residuals: 44 BIC: -345.1
Df Model: 5
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept 0.7587 0.002 337.565 0.000 0.754 0.763
encoder[T.OneHotEncoder()] -0.0025 0.002 -1.259 0.215 -0.006 0.001
scaler[T.None] -0.0037 0.003 -1.464 0.150 -0.009 0.001
scaler[T.StandardScaler()] 0.0006 0.002 0.297 0.768 -0.004 0.005
max_features -0.0226 0.003 -8.423 0.000 -0.028 -0.017
n_estimators 2.133e-06 5.61e-07 3.804 0.000 1e-06 3.26e-06
==============================================================================
Omnibus: 0.075 Durbin-Watson: 0.964
Prob(Omnibus): 0.963 Jarque-Bera (JB): 0.033
Skew: -0.036 Prob(JB): 0.984
Kurtosis: 2.898 Cond. No. 1.24e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.24e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# Standardize the numeric columns (and pass the non-numeric columns through
# unchanged) so the regression coefficients are on a comparable scale.
# (Removed an unused `scaler = StandardScaler()` local and its commented-out
# call; the scaling is done inside the pipeline below.)
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
# fit_transform returns an array; rebuild a DataFrame with columns in the
# same order the ColumnTransformer emits them (numeric first, then the rest)
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'max_features', 'n_estimators'] ['encoder', 'scaler']
| roc_auc_Mean | max_features | n_estimators | encoder | scaler | |
|---|---|---|---|---|---|
| 0 | 1.835755 | -0.728413 | 0.134576 | CustomOrdinalEncoder() | StandardScaler() |
| 1 | 1.517946 | -0.723131 | -0.759396 | CustomOrdinalEncoder() | StandardScaler() |
| 2 | 1.357486 | -0.725245 | -0.186244 | CustomOrdinalEncoder() | MinMaxScaler() |
| 3 | 1.299496 | -0.728413 | 0.162213 | CustomOrdinalEncoder() | StandardScaler() |
| 4 | 1.187714 | -0.728413 | 1.088628 | CustomOrdinalEncoder() | MinMaxScaler() |
# The ColumnTransformer output is object-dtype, so cast the numeric columns
# back to float before fitting.
for _column in ['roc_auc_Mean', 'max_features', 'n_estimators']:
    score_dataframe_transformed[_column] = score_dataframe_transformed[_column].astype('float')
print(formula)
# Re-fit the same regression on the standardized data; the coefficients are
# now on comparable scales.
model = smf.ols(formula=formula, data=score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ max_features + n_estimators + encoder + scaler
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.697
Model: OLS Adj. R-squared: 0.663
Method: Least Squares F-statistic: 20.29
Date: Mon, 31 Jan 2022 Prob (F-statistic): 1.90e-10
Time: 09:13:54 Log-Likelihood: -41.055
No. Observations: 50 AIC: 94.11
Df Residuals: 44 BIC: 105.6
Df Model: 5
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept 0.1588 0.142 1.120 0.269 -0.127 0.445
encoder[T.OneHotEncoder()] -0.2251 0.179 -1.259 0.215 -0.585 0.135
scaler[T.None] -0.3319 0.227 -1.464 0.150 -0.789 0.125
scaler[T.StandardScaler()] 0.0584 0.197 0.297 0.768 -0.338 0.455
max_features -0.7306 0.087 -8.423 0.000 -0.905 -0.556
n_estimators 0.3218 0.085 3.804 0.000 0.151 0.492
==============================================================================
Omnibus: 0.075 Durbin-Watson: 0.964
Prob(Omnibus): 0.963 Jarque-Bera (JB): 0.033
Skew: -0.036 Prob(JB): 0.984
Kurtosis: 2.898 Cond. No. 3.96
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Collect the regression coefficients and their p-values into a tidy frame.
# Use .values so the param names appear only in the 'feature' column rather
# than also as the index (the index duplication showed up in the output).
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params.values,
    'p_value': results.pvalues.values,
})
# Drop the intercept; only the hyper-parameter effects are of interest.
# reset_index(drop=True) also ensures we assign to a fresh frame below
# (avoids pandas' chained-assignment warning on the query() result).
coefficients = coefficients.query("feature != 'Intercept'").reset_index(drop=True)
# Flag coefficients that are statistically significant at the 5% level.
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | -0.225065 | 2.146325e-01 | False |
| scaler[T.None] | scaler[T.None] | -0.331883 | 1.501781e-01 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | 0.058354 | 7.681092e-01 | False |
| max_features | max_features | -0.730597 | 1.003778e-10 | True |
| n_estimators | n_estimators | 0.321810 | 4.360650e-04 | True |
# the score column label, used in the plot title below
score_variable
'roc_auc Mean'
# horizontal bar chart of the regression coefficients, sorted by absolute
# magnitude, colored by statistical significance
px.bar(
data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
y='feature',
x='coefficient',
color='Stat Sig',
title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
height=600,
width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance

# Compute permutation importance of each feature using the best pipeline
# found by the Bayesian search, timing how long it takes.
estimator = bayes_search.best_estimator_
start_time = time.time()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
# Mean importance per feature, largest first.
feature_names = X_train.columns.to_list()
forest_importances = pd.Series(result.importances_mean, index=feature_names).sort_values(ascending=False)
Elapsed time to compute the importances: 48.509 seconds
import matplotlib.pyplot as plt
# bar chart of permutation importances with std-dev error bars
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Compare the default rate across foreign_worker groups.
temp = X_train.copy()
temp['default'] = y_train
# use the string 'mean' rather than np.mean: passing numpy functions to
# DataFrame.agg is deprecated in recent pandas versions
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Box plot of age by default status: does the age distribution differ
# between defaulters and non-defaulters?
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600 * hlp.plot.GOLDEN_RATIO,
)
fig.show()
NOTE: foreign_worker seems like it should be an important feature (the default rate differs substantially between groups — 30.8% vs 10.7% above), yet it is ranked last in the permutation feature importance.